In [1]:
from preamble import *
% matplotlib notebook
In [2]:
from glob import glob

# Read every CSV file under data/citibike/ into its own DataFrame.
# glob() returns matches in arbitrary, filesystem-dependent order, so the
# paths are sorted to make the load order (and thus the row order of anything
# built from `dfs`) reproducible between runs and machines.
dfs = [pd.read_csv(csv_path) for csv_path in sorted(glob("data/citibike/*.csv"))]
In [3]:
# Stack the per-file frames into one table. Each file was read with its own
# 0..n-1 row index, so ignore_index=True renumbers the rows instead of
# leaving duplicate index labels in the combined frame.
data = pd.concat(dfs, ignore_index=True)
In [4]:
# Show the column labels of the concatenated frame.
data.columns
Out[4]:
In [5]:
# Preview the first rows of the concatenated frame.
data.head()
Out[5]:
In [6]:
# Prepare the frame for time-based counting:
#   * "one" is a constant 1 per row, so summing it later counts rows;
#   * "starttime" is parsed to datetimes and becomes the index.
data = data.assign(
    one=1,
    starttime=pd.to_datetime(data["starttime"]),
).set_index("starttime")
In [7]:
# Count rows per start station in 3-hour bins: group by station, resample the
# datetime index, and sum the constant "one" column within each bin.
data_resampled = (
    data
    .groupby("start station id")["one"]
    .resample("3h")
    .sum()
)
In [8]:
# Pivot to a wide table with one column per station and one row per time bin.
per_station = (
    data_resampled
    .unstack(level=0)  # move index level 0 (the groupby key) into the columns
    .fillna(0)         # bins missing for a station count as zero
)
In [9]:
# Plot the 3-hour counts for station 301 on a fresh figure.
fig, ax = plt.subplots()
per_station[301].plot(ax=ax)
Out[9]:
In [10]:
from sklearn.gaussian_process import GaussianProcessRegressor
In [11]:
# Regression target: the 3-hour counts of station 301 as a plain array.
y = per_station[301].values
# Single feature: the position of each bin (0, 1, 2, ...) as a column vector.
X = np.arange(y.shape[0])[:, np.newaxis]
In [12]:
# Baseline: a Gaussian process with all-default settings, fit on the full series.
gp = GaussianProcessRegressor()
gp = gp.fit(X, y)  # fit() returns the estimator itself
In [13]:
# Overlay the model's predictions on the observed counts.
fig, ax = plt.subplots()
ax.plot(y, label="y")
ax.plot(gp.predict(X), label="preds")
ax.legend()
Out[13]:
In [14]:
# Display the kernel stored on the fitted regressor (its `kernel_` attribute).
gp.kernel_
Out[14]:
In [18]:
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, WhiteKernel

# Two plain RBF terms with different length-scale bounds.
rbf_short = 1.0 * RBF(length_scale_bounds=(2, 500))
rbf_long = 1.0 * RBF(length_scale_bounds=(50, 1000))

# Two periodic terms with fixed periods, each multiplied by an RBF term.
# The series uses 3-hour bins, so a period of 8 bins is 24 hours and a period
# of 56 bins is 7 days.
periodic_8 = (1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500))
              * ExpSineSquared(periodicity=8, periodicity_bounds="fixed"))
periodic_56 = (1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500))
               * ExpSineSquared(periodicity=56, periodicity_bounds="fixed"))

# An additional `1.0 * WhiteKernel(noise_level=1)` term was tried here and is
# left disabled.
kernel = rbf_short + rbf_long + periodic_8 + periodic_56

# Fit on the first 1500 bins only.
n_train = 1500
gp = GaussianProcessRegressor(alpha=1, normalize_y=True, kernel=kernel)
gp = gp.fit(X[:n_train], y[:n_train])
In [20]:
# Overlay the model's predictions for every bin on the observed counts.
fig, ax = plt.subplots()
ax.plot(y, label="y")
ax.plot(gp.predict(X), label="preds")
ax.legend()
Out[20]:
In [21]:
# Display the kernel stored on the fitted regressor (its `kernel_` attribute).
gp.kernel_
Out[21]: